# Remove invalid records

import pandas as pd

# Input/output locations, relative to the working directory.
inputfile = r"..\data\microwave_1selected.tsv"  # data after column selection
inputfile1 = r"..\data\microwave_2duplicate_reviews.tsv"  # duplicated reviews
outputfile = r"..\data\microwave_4removed.tsv"  # data with those reviews removed

# Load the full review table.
m_data = pd.read_csv(inputfile, encoding='utf-8', sep='\t')

# Only the first two rows of the duplicate-review file are used as the
# removal list; any further rows in that file are ignored.
m_duplicate_reviews = pd.read_csv(inputfile1, encoding='utf-8', sep='\t').head(2)

# Flag every row whose review text exactly matches one of the listed reviews.
reviews_to_drop = m_duplicate_reviews['review_body'].values.tolist()
is_listed_review = m_data['review_body'].isin(reviews_to_drop)

# Keep the rows that were NOT flagged and write them out as a TSV
# (with a header row, without the DataFrame index).
m_duplicate = m_data.loc[~is_listed_review]
m_duplicate.to_csv(
    outputfile,
    sep='\t',
    encoding='utf-8',
    header=True,
    index=False,
)
